#!/usr/bin/env python
# coding: utf-8

# In[1]:


import pandas as pd
import numpy as np
import ast
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV
import os
# Move up one directory level; the relative "data/output/" and "results/"
# paths used further down are resolved against this new working directory.
os.chdir(os.pardir)
print(os.getcwd())

import random

# Seed Python's built-in `random` generator with a fixed value.
# NOTE(review): this does not seed NumPy or scikit-learn -- confirm whether
# anything downstream relies on those generators.
SEED = 10012
random.seed(SEED)


# In[2]:


# Prefix of every input/output file name used by this script.
dataset_name = "stwts"


# In[3]:


# Embedding variants; each one has its own selected-index files.
embed_types = [
    "cvec_pca16",
    "cvec_nmf16",
    "cvec_umap16",
    "cvec_tsne16",
    "bert",
    "roberta",
    "distil",
    "glove6B",
    "universal",
    "lda100",
]
# Selection strategies; combined with embed_types to name the result columns.
selection_types = ["dopt", "kmeans", "kld", "ks", "cos", "recon"]
# Training-subset sizes taken as prefixes of the kld / ks / cos / recon orderings.
counts = [50, 100, 250, 500, 750, 1000, 1500, 2000, 2500, 3000]
# Bootstrap iteration numbers (1-based). For a quick trial run use [1, 2].
bootstrap_iters = list(range(1, 11))


# ## Results Dict
# 
# 

# In[4]:


# Result-column names: first one per random-baseline iteration, then one per
# (embedding, selection strategy, iteration) combination, in that nesting order.
random_keys = [f"random_iter{it}" for it in bootstrap_iters]
method_keys = [
    f"{embed}_{selection}_iter{it}"
    for embed in embed_types
    for selection in selection_types
    for it in bootstrap_iters
]
key_list = random_keys + method_keys

# One dict per metric; every key starts with its own empty list of scores.
acc_dict = {name: [] for name in key_list}
f1_dict = {name: [] for name in key_list}
roc_dict = {name: [] for name in key_list}


# ## Complete dataset

# In[5]:


## Load the per-iteration test-set index lists and the full data table
output_dir = "data/output/"
testset_list = pd.read_csv(f"{output_dir}{dataset_name}_testset_list.csv")

data = pd.read_csv(f"{output_dir}{dataset_name}_cvec_full.csv", index_col=0)
max_obs = data.shape[0]

# Separate the feature columns from the 'label' column.
X_full = data.drop(columns=["label"])
y_full = data["label"]

if dataset_name == "stwts":
    # Binarise the labels: 'Bullish' -> 1, everything else -> 0.
    # The rebuilt Series gets a fresh default index.
    y_full = pd.Series([1 if label == "Bullish" else 0 for label in y_full])


print(testset_list.shape[1])
print(max_obs)


# In[6]:


def Diff(li1, li2):
    """Return the symmetric difference of two iterables as a list.

    The result holds the distinct elements found only in ``li1``, followed by
    the distinct elements found only in ``li2``. Duplicates are collapsed and
    the order inside each half is whatever set iteration yields.
    """
    first, second = set(li1), set(li2)
    only_in_first = first.difference(second)
    only_in_second = second.difference(first)
    return [*only_in_first, *only_in_second]

# Smoothing values searched by GridSearchCV for MultinomialNB; the later
# sections of this script reuse the same grid.
parameters = {'alpha': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 2, 4, 6, 8, 10, 12, 14, 16]}

# One [accuracy, f1, roc_auc] row per bootstrap iteration, training on all
# rows that are not in that iteration's test set.
indices_list_fullset = []

for i in range(len(testset_list.columns)):
    ## Build the train/test split for bootstrap iteration i+1
    idx_test = testset_list["iter" + str(i + 1)]
    # The indices are positional (.iloc), so the candidate pool must start at 0.
    # A plain set difference guarantees no test position can end up in the
    # training set; sorting makes the row order deterministic.
    idx_train = sorted(set(range(max_obs)) - set(idx_test))
    X_train, y_train = X_full.iloc[idx_train], y_full.iloc[idx_train]
    X_test, y_test = X_full.iloc[idx_test], y_full.iloc[idx_test]

    gscv = GridSearchCV(MultinomialNB(), parameters, verbose=1, scoring='roc_auc')
    gscv.fit(X_train, y_train)
    mnb = gscv.best_estimator_
    print(gscv.best_params_)

    # Predict once and reuse the labels for both label-based metrics.
    y_pred = mnb.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    # Column 1 of predict_proba is used as the positive-class score.
    roc = roc_auc_score(y_test, mnb.predict_proba(X_test)[:, 1])
    indices_list_fullset.append([acc, f1, roc])


# In[7]:


## Write the full-training-set metrics, one row per bootstrap iteration
metric_names = ["acc", "f1", "roc"]
fullset_out = pd.DataFrame(indices_list_fullset)
fullset_out.columns = metric_names
fullset_out.to_csv("data/output/fullset_out.csv", index=True)


# ## Random Pick

# In[8]:


for i in range(len(testset_list.columns)):
    # Outer loop: one pass per bootstrap iteration (files and keys are 1-based).
    result_key = 'random_iter' + str(i + 1)
    idx_test = testset_list["iter" + str(i + 1)]
    X_test = X_full.iloc[idx_test]
    y_test = y_full.iloc[idx_test]
    # Only the test split is needed here: every model is trained on the row
    # positions read from the index file, not on the full training set.

    # Each non-blank line is parsed as a Python literal holding row positions.
    with open("data/output/" + "indices_" + dataset_name + "_random_iter" + str(i + 1) + ".txt") as fh:
        indices_list = [ast.literal_eval(line) for line in fh if line.strip()]

    # Inner loop: one fitted model per index list (per training-set size).
    for lst in indices_list:
        gscv = GridSearchCV(MultinomialNB(), parameters, verbose=0, scoring='roc_auc')
        gscv.fit(X_full.iloc[lst], y_full.iloc[lst])
        mnb = gscv.best_estimator_
        # Predict once and reuse the labels for accuracy and F1.
        y_pred = mnb.predict(X_test)
        acc_dict[result_key].append(accuracy_score(y_test, y_pred))
        f1_dict[result_key].append(f1_score(y_test, y_pred))
        roc_dict[result_key].append(roc_auc_score(y_test, mnb.predict_proba(X_test)[:, 1]))


# ## K-means Clustering

# In[9]:


for i in range(len(testset_list.columns)):
    # Outer loop: one pass per bootstrap iteration (files and keys are 1-based).
    idx_test = testset_list["iter" + str(i + 1)]
    X_test = X_full.iloc[idx_test]
    y_test = y_full.iloc[idx_test]
    # Only the test split is needed: models are trained on the selected rows.
    for j, embed in enumerate(embed_types):
        print('iter' + str(i) + 'embed_type' + str(j))
        result_key = embed + '_kmeans_iter' + str(i + 1)
        # Each non-blank line is parsed as a Python literal holding row positions.
        with open("data/output/" + 'indices_' + dataset_name + '_' + embed + '_kmeans_iter' + str(i + 1) + '.txt') as fh:
            indices_list = [ast.literal_eval(line) for line in fh if line.strip()]
        for lst in indices_list:
            gscv = GridSearchCV(MultinomialNB(), parameters, verbose=0, scoring='roc_auc')
            gscv.fit(X_full.iloc[lst], y_full.iloc[lst])
            mnb = gscv.best_estimator_
            # Predict once and reuse the labels for accuracy and F1.
            y_pred = mnb.predict(X_test)
            acc_dict[result_key].append(accuracy_score(y_test, y_pred))
            f1_dict[result_key].append(f1_score(y_test, y_pred))
            try:
                roc = roc_auc_score(y_test, mnb.predict_proba(X_test)[:, 1])
            except (ValueError, IndexError) as err:
                # Presumably hit when only one class is present: roc_auc_score
                # rejects single-class targets (ValueError) and a model fitted
                # on one class has no probability column 1 (IndexError).
                # NOTE(review): the fallback 0 is kept for compatibility with
                # existing outputs, but it is indistinguishable from a real
                # AUC of 0 -- consider a missing-value marker instead.
                print('ROC AUC unavailable for ' + result_key + ': ' + repr(err))
                roc = 0
            roc_dict[result_key].append(roc)


# ## Greedy farthest points based on KL Divergence

# In[10]:


for i in range(len(testset_list.columns)):
    # Outer loop: one pass per bootstrap iteration (files and keys are 1-based).
    idx_test = testset_list["iter" + str(i + 1)]
    X_test = X_full.iloc[idx_test]
    y_test = y_full.iloc[idx_test]
    # Only the test split is needed: models are trained on the selected rows.

    # Read every ordering up front and keep it paired with the embedding whose
    # file it came from, so results are filed under the right key no matter how
    # many lines a file contains.
    orderings = []
    for embed in embed_types:
        with open("data/output/" + 'indices_' + dataset_name + '_' + embed + '_kld_iter' + str(i + 1) + '.txt') as fh:
            orderings.extend((embed, ast.literal_eval(line)) for line in fh if line.strip())

    for embed, lst in orderings:
        result_key = embed + '_kld_iter' + str(i + 1)
        for c in counts:
            # Train on the first c positions of the ordering.
            subset = lst[:c]
            gscv = GridSearchCV(MultinomialNB(), parameters, verbose=0, scoring='roc_auc')
            gscv.fit(X_full.iloc[subset], y_full.iloc[subset])
            mnb = gscv.best_estimator_
            # Predict once and reuse the labels for accuracy and F1.
            y_pred = mnb.predict(X_test)
            acc_dict[result_key].append(accuracy_score(y_test, y_pred))
            f1_dict[result_key].append(f1_score(y_test, y_pred))
            try:
                roc = roc_auc_score(y_test, mnb.predict_proba(X_test)[:, 1])
            except (ValueError, IndexError) as err:
                # Presumably hit when only one class is present: roc_auc_score
                # rejects single-class targets (ValueError) and a model fitted
                # on one class has no probability column 1 (IndexError).
                # The "NA" placeholder is kept so existing outputs keep their form.
                print('ROC AUC unavailable for ' + result_key + ': ' + repr(err))
                roc = "NA"
            roc_dict[result_key].append(roc)


# ## Greedy farthest points based on Kolmogorov Smirnov statistics

# In[11]:


for i in range(len(testset_list.columns)):
    # Outer loop: one pass per bootstrap iteration (files and keys are 1-based).
    idx_test = testset_list["iter" + str(i + 1)]
    X_test = X_full.iloc[idx_test]
    y_test = y_full.iloc[idx_test]
    # Only the test split is needed: models are trained on the selected rows.

    # Read every ordering up front and keep it paired with the embedding whose
    # file it came from, so results are filed under the right key no matter how
    # many lines a file contains.
    orderings = []
    for embed in embed_types:
        with open("data/output/" + 'indices_' + dataset_name + '_' + embed + '_ks_iter' + str(i + 1) + '.txt') as fh:
            orderings.extend((embed, ast.literal_eval(line)) for line in fh if line.strip())

    for embed, lst in orderings:
        result_key = embed + '_ks_iter' + str(i + 1)
        for c in counts:
            # Train on the first c positions of the ordering.
            subset = lst[:c]
            gscv = GridSearchCV(MultinomialNB(), parameters, verbose=0, scoring='roc_auc')
            gscv.fit(X_full.iloc[subset], y_full.iloc[subset])
            mnb = gscv.best_estimator_
            # Predict once and reuse the labels for accuracy and F1.
            y_pred = mnb.predict(X_test)
            acc_dict[result_key].append(accuracy_score(y_test, y_pred))
            f1_dict[result_key].append(f1_score(y_test, y_pred))
            roc_dict[result_key].append(roc_auc_score(y_test, mnb.predict_proba(X_test)[:, 1]))


# ## Greedy farthest points based on cosine distance

# In[12]:


for i in range(len(testset_list.columns)):
    # Outer loop: one pass per bootstrap iteration (files and keys are 1-based).
    idx_test = testset_list["iter" + str(i + 1)]
    X_test = X_full.iloc[idx_test]
    y_test = y_full.iloc[idx_test]
    # Only the test split is needed: models are trained on the selected rows.

    # Read every ordering up front and keep it paired with the embedding whose
    # file it came from, so results are filed under the right key no matter how
    # many lines a file contains.
    orderings = []
    for embed in embed_types:
        with open("data/output/" + 'indices_' + dataset_name + '_' + embed + '_cos_iter' + str(i + 1) + '.txt') as fh:
            orderings.extend((embed, ast.literal_eval(line)) for line in fh if line.strip())

    for embed, lst in orderings:
        result_key = embed + '_cos_iter' + str(i + 1)
        for c in counts:
            # Train on the first c positions of the ordering.
            subset = lst[:c]
            gscv = GridSearchCV(MultinomialNB(), parameters, verbose=0, scoring='roc_auc')
            gscv.fit(X_full.iloc[subset], y_full.iloc[subset])
            mnb = gscv.best_estimator_
            # Predict once and reuse the labels for accuracy and F1.
            y_pred = mnb.predict(X_test)
            acc_dict[result_key].append(accuracy_score(y_test, y_pred))
            f1_dict[result_key].append(f1_score(y_test, y_pred))
            roc_dict[result_key].append(roc_auc_score(y_test, mnb.predict_proba(X_test)[:, 1]))


# ## Reconstruction Loss Minimization

# In[13]:


# Embeddings evaluated for the 'recon' strategy. 'glove6B', 'universal' and
# 'lda100' are not included, so their '_recon_' result lists are left empty here.
embed_types2 = ['cvec_pca16', 'cvec_nmf16', 'cvec_umap16', 'cvec_tsne16', 'bert', 'roberta', 'distil']


for i in range(len(testset_list.columns)):
    # Outer loop: one pass per bootstrap iteration (files and keys are 1-based).
    idx_test = testset_list["iter" + str(i + 1)]
    X_test = X_full.iloc[idx_test]
    y_test = y_full.iloc[idx_test]
    # Only the test split is needed: models are trained on the selected rows.

    # Load one saved index array per embedding before any model is fitted.
    orderings = [
        (embed, list(np.load("data/output/" + 'indices_' + dataset_name + '_' + embed + '_recon_iter' + str(i + 1) + '.npy')))
        for embed in embed_types2
    ]

    for embed, lst in orderings:
        result_key = embed + '_recon_iter' + str(i + 1)
        for c in counts:
            # Train on the first c positions of the ordering.
            subset = lst[:c]
            gscv = GridSearchCV(MultinomialNB(), parameters, verbose=0, scoring='roc_auc')
            gscv.fit(X_full.iloc[subset], y_full.iloc[subset])
            mnb = gscv.best_estimator_
            # Predict once and reuse the labels for accuracy and F1.
            y_pred = mnb.predict(X_test)
            acc_dict[result_key].append(accuracy_score(y_test, y_pred))
            f1_dict[result_key].append(f1_score(y_test, y_pred))
            roc_dict[result_key].append(roc_auc_score(y_test, mnb.predict_proba(X_test)[:, 1]))


# ## Taddy's d-optimality

# In[14]:


# Evaluate the 'dopt' selections; the index files are read the same way as
# the k-means ones (one index list per line).
for i in range(len(testset_list.columns)):
    # Outer loop: one pass per bootstrap iteration (files and keys are 1-based).
    idx_test = testset_list["iter" + str(i + 1)]
    X_test = X_full.iloc[idx_test]
    y_test = y_full.iloc[idx_test]
    # Only the test split is needed: models are trained on the selected rows.
    for j, embed in enumerate(embed_types):
        print('iter' + str(i) + 'embed_type' + str(j))
        result_key = embed + '_dopt_iter' + str(i + 1)
        # Each non-blank line is parsed as a Python literal holding row positions.
        with open("data/output/" + 'indices_' + dataset_name + '_' + embed + '_dopt_iter' + str(i + 1) + '.txt') as fh:
            indices_list = [ast.literal_eval(line) for line in fh if line.strip()]
        for lst in indices_list:
            gscv = GridSearchCV(MultinomialNB(), parameters, verbose=0, scoring='roc_auc')
            gscv.fit(X_full.iloc[lst], y_full.iloc[lst])
            mnb = gscv.best_estimator_
            # Predict once and reuse the labels for accuracy and F1.
            y_pred = mnb.predict(X_test)
            acc_dict[result_key].append(accuracy_score(y_test, y_pred))
            f1_dict[result_key].append(f1_score(y_test, y_pred))
            # No fallback here: a failure to compute the AUC stops the run.
            roc_dict[result_key].append(roc_auc_score(y_test, mnb.predict_proba(X_test)[:, 1]))


# ## Save dicts to csv

# In[15]:


# Write one CSV per metric. With orient='index' each key becomes a row; the
# transpose turns every key into its own column.
for metric_tag, metric_scores in (("acc", acc_dict), ("f1", f1_dict), ("roc", roc_dict)):
    table = pd.DataFrame.from_dict(metric_scores, orient="index").T
    table.to_csv(f"results/{dataset_name}_{metric_tag}_mnb.csv")

